{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# preprocessing imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from time import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.preprocessing.text import text_to_word_sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# functions we implemented\n",
    "from custom_functions import init_embeddings_map, get_embed_and_pad_func"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emb_size = 50\n",
    "embedding_map = init_embeddings_map(\"glove.6B.\" + str(emb_size) + \"d.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data = pd.read_csv(\"data/unembedded_grouped_cleaned_data.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train/test split for our model is unique, we need to hold out a\n",
    "# set of users and movies so that our network never learns those \n",
    "test_size = 0.005\n",
    "\n",
    "# get test_size percentage of users\n",
    "unique_users = raw_data.loc[:, \"reviewerID\"].unique()\n",
    "users_size = len(unique_users)\n",
    "test_idx = np.random.choice(users_size,\n",
    "                              size=int(users_size * test_size),\n",
    "                              replace=False)\n",
    "\n",
    "# get test users\n",
    "test_users = unique_users[test_idx]\n",
    "\n",
    "# everyone else is a training user\n",
    "train_users = np.delete(unique_users, test_idx)\n",
    "\n",
    "test = raw_data[raw_data[\"reviewerID\"].isin(test_users)]\n",
    "train = raw_data[raw_data[\"reviewerID\"].isin(train_users)]\n",
    "\n",
    "unique_test_movies = test[\"asin\"].unique()\n",
    "\n",
    "# drop the movies that also appear in our test set. In order to be\n",
    "# a true train/test split, we are forced to discard some data entirely\n",
    "train = train.where(np.logical_not(train[\"asin\"].isin(unique_test_movies))).dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_seq_sizes = raw_data.loc[:, \"userReviews\"].apply(lambda x: x.split()).apply(len)\n",
    "item_seq_sizes = raw_data.loc[:, \"movieReviews\"].apply(lambda x: x.split()).apply(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "u_ptile = 40\n",
    "i_ptile = 15\n",
    "u_seq_len = int(np.percentile(user_seq_sizes, u_ptile))\n",
    "i_seq_len = int(np.percentile(item_seq_sizes, i_ptile))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_fn = get_embed_and_pad_func(i_seq_len, u_seq_len, np.array([0.0] * emb_size), embedding_map)\n",
    "    \n",
    "train_embedded = train.apply(embedding_fn, axis=1)\n",
    "test_embedded = test.apply(embedding_fn, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DeepCoNN Recommendation Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# modeling imports\n",
    "from keras.models import Model\n",
    "from keras.callbacks import EarlyStopping, TensorBoard\n",
    "from keras.layers import LSTM\n",
    "from keras.layers import Input, Dense\n",
    "from keras.layers.merge import Add, Dot, Concatenate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DeepCoNN():\n",
    "    def __init__(self, embedding_size, hidden_size, rnn_hidden_size, u_seq_len, m_seq_len, filters=2, kernel_size=8,\n",
    "                 strides=6):\n",
    "        self.embedding_size = embedding_size\n",
    "        self.hidden_size = hidden_size\n",
    "        self.rnn_hidden_size = rnn_hidden_size\n",
    "        self.filters = filters\n",
    "        self.kernel_size = kernel_size\n",
    "        self.inputU, self.towerU = self.create_deepconn_tower(u_seq_len)\n",
    "        self.inputM, self.towerM = self.create_deepconn_tower(m_seq_len)\n",
    "        self.joined = Concatenate()([self.towerU, self.towerM])\n",
    "        self.outNeuron = Dense(1)(self.joined)\n",
    "\n",
    "    def create_deepconn_tower(self, max_seq_len):\n",
    "        input_layer = Input(shape=(max_seq_len, self.embedding_size))\n",
    "        tower = LSTM(self.rnn_hidden_size, activation=\"tanh\")(input_layer)\n",
    "        tower = Dense(self.hidden_size, activation=\"relu\")(tower)\n",
    "        return input_layer, tower\n",
    "\n",
    "    def create_deepconn_dp(self):\n",
    "        dotproduct = Dot(axes=1)([self.towerU, self.towerM])\n",
    "        output = Add()([self.outNeuron, dotproduct])\n",
    "        self.model = Model(inputs=[self.inputU, self.inputM], outputs=[output])\n",
    "        self.model.compile(optimizer='Adam', loss='mse')\n",
    "        \n",
    "    def train(self, train_data, batch_size, epochs=3500):\n",
    "        tensorboard = TensorBoard(log_dir=\"tf_logs/{}\".format(time()))\n",
    "        self.create_deepconn_dp()\n",
    "        print(self.model.summary())\n",
    "        \n",
    "        user_reviews = np.array(list(train_data.loc[:, \"userReviews\"]))\n",
    "        movie_reviews = np.array(list(train_data.loc[:, \"movieReviews\"]))\n",
    "\n",
    "        self.train_inputs = [user_reviews, movie_reviews]\n",
    "        self.train_outputs = train_data.loc[:, \"overall\"]\n",
    "\n",
    "        self.history = self.model.fit(self.train_inputs,\n",
    "                                      self.train_outputs,\n",
    "                                      callbacks=[tensorboard],\n",
    "                                      validation_split=0.05,\n",
    "                                      batch_size=batch_size,\n",
    "                                      epochs=epochs)\n",
    "        \n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hidden_size = 64\n",
    "rnn_hidden_size = 64\n",
    "deepconn = DeepCoNN(emb_size, hidden_size, rnn_hidden_size, u_seq_len, i_seq_len)\n",
    "\n",
    "batch_size = 32\n",
    "deepconn.train(train_embedded, batch_size, epochs=20)\n",
    "\n",
    "deepconn.model.save(\"lstm.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_reviews = np.array(list(test_embedded.loc[:, \"userReviews\"]))\n",
    "movie_reviews = np.array(list(test_embedded.loc[:, \"movieReviews\"]))\n",
    "\n",
    "test_inputs = [user_reviews, movie_reviews]\n",
    "\n",
    "dat = pd.DataFrame(test_inputs)\n",
    "dat.to_csv(\"data/test_data.csv\")\n",
    "\n",
    "true_rating = np.array(list(test_embedded.loc[:, \"overall\"])).reshape((-1, 1))\n",
    "\n",
    "predictions = deepconn.model.predict(test_inputs)\n",
    "\n",
    "error = np.square(predictions - true_rating)\n",
    "\n",
    "print(\"MSE:\", np.average(error))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
